This document was compiled on the 2024-11-14 14:38:34.792318 by carol.
The data (Database_Hendrickx_2019_Dentes Poyos.xlsx) were provided by Elisabete Malafaia (EM) on 31/07/2024 and handed over to Carolina Marques (CM) on an external storage device.
The dataset contains several parameters obtained from measurements of theropod teeth; most of them are explained in the following schemes:
All of the above schemes come from Hendrickx, Mateus, and Araújo (2015).
#data1 <- read_xlsx("Database_Hendrickx_2019_Dentes Poyos_Informacao idade.xlsx")
#dd<-data.table(Epoch=data1$Epoch,"Taxa (Genus)"=data1$`Taxa (Genus)`)
#dd<-dd[!duplicated(dd),]
# ---- Load and sanitise the crown measurement dataset ----
# Reads the workbook, blanks out placeholder cells, strips annotation
# characters from every cell, recodes ranged LIF entries and drops unused
# columns. Two objects are left behind for the following steps:
#   data1 - snapshot taken just before the positional column drop
#   data  - reduced table used by the first analysis pass
data <- read_xlsx("Crown measurement dataset Kem Kem theropods.xlsx")

# Cells that contain nothing but a placeholder symbol become missing values.
data[data == "?"] <- NA
data[data == "~"] <- NA
data[data == "/"] <- NA

# Strip annotation characters from a single column.
# The original code ran eleven separate full-table gsub() passes. Two of them
# ("\\>" and "\\<") are regex word-boundary anchors rather than literal
# characters, so they never removed anything. The literal "<" and ">" passes
# did the work, and they are kept here inside the character class.
# gsub() returns character vectors, so every column is character afterwards.
strip_annotations <- function(x) {
  x <- gsub("\\? ", "", x)  # "? " marker together with its trailing space
  x <- gsub("absent", 0, x) # an absent feature is recorded as 0
  gsub("[?<>~:;]", "", x)   # remaining ?, <, >, ~, : and ; characters
}
data[] <- lapply(data, strip_annotations)

# Recode ranged LIF entries to a single value. All replacements are strings,
# matching the (character) column they are written into.
# NOTE(review): "10-13" maps to 12, whereas the midpoint rule used for the
# other ranges would give 11.5 - confirm this is intended.
lif_recode <- c(
  "6-7" = "6.5",
  "5-6" = "5.5",
  "4-5" = "4.5",
  "3-4" = "3.5",
  "11 or 12" = "11.5",
  "10-13" = "12"
)
is_ranged <- data$LIF %in% names(lif_recode)
data$LIF[is_ranged] <- unname(lif_recode[data$LIF[is_ranged]])

# Keep a single CH column, taken from `CH...22`; drop the other CH column
# and the derived (DDL/CH)*100 column.
# NOTE(review): the `...22` / `...60` suffixes look like de-duplicated column
# names carrying the spreadsheet column position - confirm against the sheet.
data$CH <- data$CH...22
data <- data %>% select(-CH...60, -CH...22, -`(DDL/CH)*100`)

# Snapshot with all descriptive columns still present; the second pass
# further down relabels taxa on this copy.
data1 <- data

# Drop descriptive columns by position. This depends on the exact column
# order of the workbook.
data <- data[, -c(1, 2, 5, 4, 6:14, 16:19)] # until 19
#clade: 4, taxa:2, teethtaxa: 3, cladetteth: 5, epoch:15
#data<-inner_join(dd,data)
#data<-data[!duplicated(data),]
# ---- Derive flags, coerce measurements to numeric, add log columns ----
# TransvUndu / Interdentsulci: 1 for any recorded non-zero entry; zeros and
# missing entries are carried over unchanged.
# LAF: the ranged entry "6-7" is replaced by 6.5.
# CTU1: first run of digits found in CTU (entries without digits are left
# as they are).
data <- data %>%
  mutate(
    TransvUndu = ifelse(
      !is.na(`Transv. Undu.`) & `Transv. Undu.` != 0, 1, `Transv. Undu.`
    ),
    Interdentsulci = ifelse(
      !is.na(`Interdent. sulci`) & `Interdent. sulci` != 0,
      1,
      `Interdent. sulci`
    ),
    LAF = ifelse(LAF == "6-7", 6.5, LAF),
    CTU1 = sub(".*?(\\d+).*", "\\1", CTU)
  ) %>%
  select(-CTU, -`Interdent. sulci`, -`Transv. Undu.`)

# Every column except the first two (TaxonToothtype, Epoch) is converted to
# numeric and then gets a natural-log companion named Log_<column>.
data <- data %>%
  mutate(across(-c(1, 2), as.numeric)) %>%
  mutate(across(-c(1, 2), log, .names = "Log_{.col}"))

# Grouping variables become factors; the quoted spelling of
# "Middle Cretaceous" is normalised first.
data <- data %>%
  mutate(
    TaxonToothtype = as.factor(TaxonToothtype),
    Epoch = as.factor(
      ifelse(Epoch == "'Middle Cretaceous'", "Middle Cretaceous", Epoch)
    )
  )

# Columns to be checked
#columns_to_check <- c("MA", "MC", "MB", "DA", "DC", "DB", "MAVG", "DAVG", "DSDI")
# Replace values equal to 100 with 0 in the specified columns
#data[columns_to_check] <- lapply(data[columns_to_check], function(x) {
# x[x == 100] <- 0
#return(x)
#})
#data$Taxa<-as.factor(paste0(data$`Taxa (Genus)`,data$Maturity,sep=" "))

# Convert to a plain data.frame.
data <- data.frame(data)
## TaxonToothtype Epoch CBL
## Saurornitholestes lateral:133 Late Cretaceous :724 Min. : 0.380
## Tyrannosaurus lateral :114 Middle Cretaceous:238 1st Qu.: 4.282
## Acrocanthosaurus lateral : 48 Late Jurassic :205 Median : 9.950
## Richardoestesia lateral : 46 Early Cretaceous : 81 Mean :13.916
## Majungasaurus lateral : 41 Late Triassic : 57 3rd Qu.:19.782
## Pectinodon lateral : 40 Middle Jurassic : 55 Max. :54.500
## (Other) :949 (Other) : 11 NA's :3
## CBW AL CBR CHR
## Min. : 0.540 Min. : 0.55 Min. :0.2500 Min. :0.400
## 1st Qu.: 2.300 1st Qu.: 12.40 1st Qu.:0.4598 1st Qu.:1.634
## Median : 6.100 Median : 29.82 Median :0.5420 Median :1.908
## Mean : 9.074 Mean : 37.19 Mean :0.5920 Mean :1.935
## 3rd Qu.:13.430 3rd Qu.: 55.15 3rd Qu.:0.6895 3rd Qu.:2.191
## Max. :48.600 Max. :152.84 Max. :2.1841 Max. :4.222
## NA's :69 NA's :338 NA's :81 NA's :14
## MCL MCW MCR MDE
## Min. : 0.32 Min. : 0.940 Min. :0.3841 Min. :-13.880
## 1st Qu.: 6.69 1st Qu.: 4.480 1st Qu.:0.5000 1st Qu.: 0.000
## Median :12.13 Median : 7.390 Median :0.5745 Median : 0.000
## Mean :13.02 Mean : 8.274 Mean :0.6087 Mean : 5.636
## 3rd Qu.:17.98 3rd Qu.:10.960 3rd Qu.:0.6818 3rd Qu.: 8.360
## Max. :37.10 Max. :30.200 Max. :1.2792 Max. : 58.400
## NA's :876 NA's :906 NA's :907 NA's :1024
## MSL MEC LAF LIF
## Min. : 1.42 Min. : 0.00 Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 17.55 1st Qu.: 68.74 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median : 28.02 Median :100.00 Median : 0.0000 Median : 0.0000
## Mean : 32.77 Mean : 83.93 Mean : 0.3948 Mean : 0.4748
## 3rd Qu.: 44.01 3rd Qu.:100.00 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :123.63 Max. :113.69 Max. :15.0000 Max. :15.0000
## NA's :1025 NA's :1031 NA's :739 NA's :755
## DMT DDT DLAT DLIT
## Min. :0.100 Min. : 0.100 Min. :0.100 Min. :0.100
## 1st Qu.:1.400 1st Qu.: 1.250 1st Qu.:1.000 1st Qu.:1.075
## Median :2.200 Median : 3.000 Median :2.400 Median :2.200
## Mean :2.888 Mean : 3.152 Mean :2.625 Mean :2.432
## 3rd Qu.:4.485 3rd Qu.: 4.255 3rd Qu.:4.000 3rd Qu.:3.345
## Max. :8.500 Max. :10.320 Max. :8.140 Max. :7.950
## NA's :1324 NA's :1324 NA's :1322 NA's :1324
## CA CA2 MA MC
## Min. : 8.50 Min. :-1.120 Min. : 4.66 Min. : 4.70
## 1st Qu.:68.27 1st Qu.:-0.080 1st Qu.: 9.00 1st Qu.: 9.25
## Median :83.22 Median : 0.010 Median :11.25 Median :12.00
## Mean :74.87 Mean : 0.002 Mean :13.76 Mean :16.09
## 3rd Qu.:86.28 3rd Qu.: 0.100 3rd Qu.:14.00 3rd Qu.:19.00
## Max. :88.11 Max. : 0.360 Max. :60.00 Max. :57.90
## NA's :1004 NA's :1026 NA's :888 NA's :590
## MB DA DC DB MAVG
## Min. : 6.00 Min. : 4.00 Min. : 0.00 Min. : 6.0 Min. : 0.00
## 1st Qu.:11.00 1st Qu.: 9.50 1st Qu.:10.24 1st Qu.:11.5 1st Qu.: 9.20
## Median :13.00 Median :12.00 Median :15.00 Median :14.8 Median :12.00
## Mean :14.25 Mean :13.89 Mean :17.79 Mean :16.5 Mean :14.44
## 3rd Qu.:16.00 3rd Qu.:15.00 3rd Qu.:21.06 3rd Qu.:18.5 3rd Qu.:16.08
## Max. :45.00 Max. :71.00 Max. :70.00 Max. :80.0 Max. :55.00
## NA's :1015 NA's :790 NA's :191 NA's :845 NA's :646
## DAVG DAVG2 TDD DSDI
## Min. : 1.56 Min. :-0.9200 Min. : 0.20 Min. : 0.6654
## 1st Qu.: 7.65 1st Qu.:-0.0940 1st Qu.: 36.00 1st Qu.: 0.9329
## Median :11.50 Median : 0.0100 Median : 56.80 Median : 1.0000
## Mean :13.69 Mean : 0.8887 Mean : 77.66 Mean : 3.7840
## 3rd Qu.:16.25 3rd Qu.: 0.1260 3rd Qu.:108.03 3rd Qu.: 1.1157
## Max. :80.00 Max. :35.0000 Max. :368.62 Max. :269.8500
## NA's :284 NA's :1056 NA's :293 NA's :737
## CMA CAA CDA MDL
## Min. : 16.11 Min. : 3.151 Min. : 17.79 Min. :0.0864
## 1st Qu.: 58.12 1st Qu.:23.483 1st Qu.: 80.08 1st Qu.:0.2632
## Median : 64.83 Median :27.066 Median : 86.92 Median :0.4167
## Mean : 64.58 Mean :27.332 Mean : 88.09 Mean :0.4079
## 3rd Qu.: 71.52 3rd Qu.:31.122 3rd Qu.: 94.37 3rd Qu.:0.5405
## Max. :148.96 Max. :74.262 Max. :160.74 Max. :1.0638
## NA's :443 NA's :438 NA's :439 NA's :590
## DDL ...61 CH TransvUndu
## Min. :0.07143 Min. : NA Min. : 0.570 Min. :0.0000
## 1st Qu.:0.23739 1st Qu.: NA 1st Qu.: 7.707 1st Qu.:0.0000
## Median :0.33333 Median : NA Median : 18.405 Median :1.0000
## Mean :0.36248 Mean :NaN Mean : 28.030 Mean :0.5149
## 3rd Qu.:0.48603 3rd Qu.: NA 3rd Qu.: 40.862 3rd Qu.:1.0000
## Max. :1.11111 Max. : NA Max. :145.550 Max. :1.0000
## NA's :191 NA's :1371 NA's :7 NA's :969
## Interdentsulci CTU1 Log_CBL Log_CBW
## Min. :0.0000 Min. : 0.0 Min. :-0.9676 Min. :-0.6162
## 1st Qu.:0.0000 1st Qu.: 0.0 1st Qu.: 1.4545 1st Qu.: 0.8329
## Median :0.0000 Median : 2.0 Median : 2.2976 Median : 1.8083
## Mean :0.4187 Mean : 1.5 Mean : 2.1924 Mean : 1.7243
## 3rd Qu.:1.0000 3rd Qu.: 3.0 3rd Qu.: 2.9848 3rd Qu.: 2.5975
## Max. :1.0000 Max. :10.0 Max. : 3.9982 Max. : 3.8836
## NA's :996 NA's :969 NA's :3 NA's :69
## Log_AL Log_CBR Log_CHR Log_MCL
## Min. :-0.5978 Min. :-1.3863 Min. :-0.9163 Min. :-1.139
## 1st Qu.: 2.5177 1st Qu.:-0.7769 1st Qu.: 0.4912 1st Qu.: 1.901
## Median : 3.3952 Median :-0.6125 Median : 0.6461 Median : 2.496
## Mean : 3.1984 Mean :-0.5745 Mean : 0.6339 Mean : 2.336
## 3rd Qu.: 4.0101 3rd Qu.:-0.3718 3rd Qu.: 0.7846 3rd Qu.: 2.889
## Max. : 5.0294 Max. : 0.7812 Max. : 1.4404 Max. : 3.614
## NA's :338 NA's :81 NA's :14 NA's :876
## Log_MCW Log_MCR Log_MDE Log_MSL
## Min. :-0.0619 Min. :-0.9570 Min. : -Inf Min. :0.3507
## 1st Qu.: 1.4996 1st Qu.:-0.6931 1st Qu.: -Inf 1st Qu.:2.8650
## Median : 2.0001 Median :-0.5543 Median : -Inf Median :3.3331
## Mean : 1.9078 Mean :-0.5228 Mean : -Inf Mean :3.2569
## 3rd Qu.: 2.3943 3rd Qu.:-0.3830 3rd Qu.:2.176 3rd Qu.:3.7843
## Max. : 3.4078 Max. : 0.2462 Max. :4.067 Max. :4.8173
## NA's :906 NA's :907 NA's :1044 NA's :1025
## Log_MEC Log_LAF Log_LIF Log_DMT
## Min. : -Inf Min. : -Inf Min. : -Inf Min. :-2.3026
## 1st Qu.:4.230 1st Qu.: -Inf 1st Qu.: -Inf 1st Qu.: 0.3365
## Median :4.605 Median : -Inf Median : -Inf Median : 0.7885
## Mean : -Inf Mean : -Inf Mean : -Inf Mean : 0.7166
## 3rd Qu.:4.605 3rd Qu.: -Inf 3rd Qu.: -Inf 3rd Qu.: 1.5007
## Max. :4.734 Max. :2.708 Max. :2.708 Max. : 2.1401
## NA's :1031 NA's :739 NA's :755 NA's :1324
## Log_DDT Log_DLAT Log_DLIT Log_CA
## Min. :-2.3026 Min. :-2.3026 Min. :-2.3026 Min. :2.140
## 1st Qu.: 0.2223 1st Qu.: 0.0000 1st Qu.: 0.0721 1st Qu.:4.223
## Median : 1.0986 Median : 0.8755 Median : 0.7885 Median :4.421
## Mean : 0.7985 Mean : 0.5680 Mean : 0.5162 Mean :4.272
## 3rd Qu.: 1.4478 3rd Qu.: 1.3863 3rd Qu.: 1.2074 3rd Qu.:4.457
## Max. : 2.3341 Max. : 2.0968 Max. : 2.0732 Max. :4.479
## NA's :1324 NA's :1322 NA's :1324 NA's :1004
## Log_CA2 Log_MA Log_MC Log_MB
## Min. : -Inf Min. :1.539 Min. :1.548 Min. :1.792
## 1st Qu.:-3.219 1st Qu.:2.197 1st Qu.:2.225 1st Qu.:2.398
## Median :-2.408 Median :2.420 Median :2.485 Median :2.565
## Mean : -Inf Mean :2.498 Mean :2.629 Mean :2.608
## 3rd Qu.:-1.897 3rd Qu.:2.639 3rd Qu.:2.944 3rd Qu.:2.773
## Max. :-1.022 Max. :4.094 Max. :4.059 Max. :3.807
## NA's :1182 NA's :888 NA's :590 NA's :1015
## Log_DA Log_DC Log_DB Log_MAVG
## Min. :1.386 Min. : -Inf Min. :1.792 Min. : -Inf
## 1st Qu.:2.251 1st Qu.:2.326 1st Qu.:2.442 1st Qu.:2.219
## Median :2.485 Median :2.708 Median :2.695 Median :2.485
## Mean :2.527 Mean : -Inf Mean :2.721 Mean : -Inf
## 3rd Qu.:2.708 3rd Qu.:3.047 3rd Qu.:2.918 3rd Qu.:2.778
## Max. :4.263 Max. :4.248 Max. :4.382 Max. :4.007
## NA's :790 NA's :191 NA's :845 NA's :646
## Log_DAVG Log_DAVG2 Log_TDD Log_DSDI
## Min. :0.4447 Min. : -Inf Min. :-1.609 Min. :-0.4074
## 1st Qu.:2.0347 1st Qu.:-2.996 1st Qu.: 3.584 1st Qu.:-0.0694
## Median :2.4423 Median :-2.278 Median : 4.040 Median : 0.0000
## Mean :2.3747 Mean : -Inf Mean : 4.039 Mean : 0.1028
## 3rd Qu.:2.7879 3rd Qu.:-1.561 3rd Qu.: 4.682 3rd Qu.: 0.1095
## Max. :4.3820 Max. : 3.555 Max. : 5.910 Max. : 5.5979
## NA's :284 NA's :1195 NA's :293 NA's :737
## Log_CMA Log_CAA Log_CDA Log_MDL
## Min. :2.779 Min. :1.148 Min. :2.879 Min. :-2.4493
## 1st Qu.:4.062 1st Qu.:3.156 1st Qu.:4.383 1st Qu.:-1.3350
## Median :4.172 Median :3.298 Median :4.465 Median :-0.8755
## Mean :4.149 Mean :3.275 Mean :4.467 Mean :-1.0193
## 3rd Qu.:4.270 3rd Qu.:3.438 3rd Qu.:4.547 3rd Qu.:-0.6152
## Max. :5.004 Max. :4.308 Max. :5.080 Max. : 0.0619
## NA's :443 NA's :438 NA's :439 NA's :590
## Log_DDL Log_...61 Log_CH Log_TransvUndu
## Min. :-2.6391 Min. : NA Min. :-0.5621 Min. :-Inf
## 1st Qu.:-1.4380 1st Qu.: NA 1st Qu.: 2.0422 1st Qu.:-Inf
## Median :-1.0986 Median : NA Median : 2.9126 Median : 0
## Mean :-1.1308 Mean :NaN Mean : 2.8310 Mean :-Inf
## 3rd Qu.:-0.7215 3rd Qu.: NA 3rd Qu.: 3.7102 3rd Qu.: 0
## Max. : 0.1054 Max. : NA Max. : 4.9805 Max. : 0
## NA's :191 NA's :1371 NA's :7 NA's :969
## Log_Interdentsulci Log_CTU1
## Min. :-Inf Min. : -Inf
## 1st Qu.:-Inf 1st Qu.: -Inf
## Median :-Inf Median :0.6931
## Mean :-Inf Mean : -Inf
## 3rd Qu.: 0 3rd Qu.:1.0986
## Max. : 0 Max. :2.3026
## NA's :996 NA's :969
# ---- First pass: complete cases and sufficiently sampled taxa ----
# `data1_cleaned` is read below, but no definition of it is visible before
# this point. It is derived here from `data` with the same steps that the
# data1 pass later in the file uses: drop columns with more than 15% missing
# values, then drop rows with any NA.
# NOTE(review): if a hidden chunk already builds data1_cleaned differently,
# remove these three lines.
missing_counts <- colSums(is.na(data))
data1_cleaned <- data[, missing_counts <= nrow(data) * 0.15]
data1_cleaned <- na.omit(data1_cleaned)

# Minimum group size: half the number of columns after the two grouping
# columns.
lennn <- (ncol(data1_cleaned) - 2) / 2

# Frequency table of taxon/tooth-type labels, most frequent first.
taxa1 <- table(data1_cleaned$TaxonToothtype)
data1_cleanedd <- data.frame(taxa1)
data1_cleanedd <- data1_cleanedd[order(data1_cleanedd$Freq, decreasing = TRUE), ]
data1_cleanedd$ID <- seq_len(nrow(data1_cleanedd))
data1_cleanedd$TaxonToothtype <- data1_cleanedd$Var1

# Keep only the rows whose label occurs more than `lennn` times.
data1_cleaned1 <- data1_cleanedd[data1_cleanedd$Freq > lennn, ]
data1_cleaned <- data1_cleaned[
  data1_cleaned$TaxonToothtype %in% unique(data1_cleaned1$TaxonToothtype),
]
summary(data1_cleaned)
## TaxonToothtype Epoch CBL
## Saurornitholestes lateral:133 Late Cretaceous :596 Min. : 1.400
## Tyrannosaurus lateral :106 Late Jurassic :135 1st Qu.: 4.775
## Richardoestesia lateral : 45 Middle Cretaceous:118 Median :10.910
## Acrocanthosaurus lateral : 42 Early Cretaceous : 40 Mean :15.165
## Majungasaurus lateral : 40 Late Triassic : 25 3rd Qu.:22.465
## Pectinodon lateral : 40 Middle Jurassic : 25 Max. :54.500
## (Other) :533 (Other) : 0
## CBW CBR CHR DC
## Min. : 0.600 Min. :0.2500 Min. :0.400 Min. : 4.50
## 1st Qu.: 2.200 1st Qu.:0.4480 1st Qu.:1.634 1st Qu.:10.00
## Median : 5.700 Median :0.5200 Median :1.905 Median :15.00
## Mean : 9.409 Mean :0.5676 Mean :1.913 Mean :17.36
## 3rd Qu.:14.480 3rd Qu.:0.6516 3rd Qu.:2.179 3rd Qu.:20.20
## Max. :48.600 Max. :2.1840 Max. :3.509 Max. :60.00
##
## DDL CH Log_CBL Log_CBW
## Min. :0.08333 Min. : 2.20 Min. :0.3365 Min. :-0.5108
## 1st Qu.:0.24752 1st Qu.: 8.50 1st Qu.:1.5634 1st Qu.: 0.7885
## Median :0.33333 Median : 19.10 Median :2.3897 Median : 1.7405
## Mean :0.36771 Mean : 30.48 Mean :2.3158 Mean : 1.6997
## 3rd Qu.:0.50000 3rd Qu.: 45.33 3rd Qu.:3.1120 3rd Qu.: 2.6728
## Max. :1.11111 Max. :145.55 Max. :3.9982 Max. : 3.8836
##
## Log_CBR Log_CHR Log_DC Log_DDL
## Min. :-1.3863 Min. :-0.9163 Min. :1.504 Min. :-2.4849
## 1st Qu.:-0.8029 1st Qu.: 0.4912 1st Qu.:2.303 1st Qu.:-1.3962
## Median :-0.6539 Median : 0.6444 Median :2.708 Median :-1.0986
## Mean :-0.6149 Mean : 0.6260 Mean :2.723 Mean :-1.1136
## 3rd Qu.:-0.4283 3rd Qu.: 0.7787 3rd Qu.:3.006 3rd Qu.:-0.6931
## Max. : 0.7812 Max. : 1.2554 Max. :4.094 Max. : 0.1054
##
## Log_CH
## Min. :0.7885
## 1st Qu.:2.1401
## Median :2.9497
## Mean :2.9494
## 3rd Qu.:3.8140
## Max. :4.9805
##
# ---- Split into log / raw tables and plot their correlation matrices ----
# Several statements in this chunk had been fused onto one line:
#   `data_lognames(data_log)[-1] <- ...` called an undefined replacement
#   function, and `data_originalcorrelation_matrix <- ...` left
#   `correlation_matrix` unset for the first plot. They are separated again.

# Positions of the log-transformed columns (their names contain "Log").
log_cols <- grep("Log", names(data1_cleaned))

# Log table: the two grouping columns plus every log column.
selected_cols <- c(1, 2, log_cols)
data_log <- data1_cleaned[, selected_cols]
data_log
# Replace spaces in column names (all but the first) with underscores.
names(data_log)[-1] <- gsub(" ", "_", names(data_log)[-1])

# Raw table: every column that is not a log column; column 1 is forced in.
cols_to_keep <- setdiff(seq_len(ncol(data1_cleaned)), log_cols)
cols_to_keep <- union(1, cols_to_keep)
data_original <- data1_cleaned[, cols_to_keep]
data_original

# Correlations among the raw measurements (grouping columns excluded).
correlation_matrix <- cor(data_original[, -c(1, 2)])
# Plot correlation matrix
corrplot(correlation_matrix, method = "color", type = "lower",
         addCoef.col = "black",
         tl.col = "black",
         tl.srt = 45,
         diag = FALSE,
         order = "hclust",
         col = colorRampPalette(c("blue", "white", "red"))(200))

# Correlations among the log-transformed measurements.
correlation_matrix <- cor(data_log[, -c(1, 2)])
# Plot correlation matrix
corrplot(correlation_matrix, method = "color", type = "lower",
         addCoef.col = "black",
         tl.col = "black",
         tl.srt = 45,
         diag = FALSE,
         order = "hclust",
         col = colorRampPalette(c("blue", "white", "red"))(200))
## [1] "TaxonToothtype" "Epoch" "CBL" "CBW"
## [5] "CBR" "CHR" "DC" "DDL"
## [9] "CH"
# ---- Keep well-sampled taxa and relabel the rest in the snapshot ----
# Both tables keep only the TaxonToothtype groups with at least 10 rows.
data_log <- data_log %>%
  group_by(TaxonToothtype) %>%
  filter(n() >= 10) %>%
  ungroup()

data_original <- data_original %>%
  group_by(TaxonToothtype) %>%
  filter(n() >= 10) %>%
  ungroup()

# Labels that still have rows after filtering. The comparison with 0 drops
# factor levels that remain in the table with a zero count.
category_counts <- table(data_log$TaxonToothtype)
unique_values <- names(category_counts[category_counts > 0])
# Print the result
# print(unique_values)

# In the snapshot, any label that did not survive the filter is replaced by
# the value of `Taxa (Genus)`. This statement had been fused onto the
# commented-out print() line above and therefore never ran.
data1$TaxonToothtype <- ifelse(
  !(data1$TaxonToothtype %in% unique_values),
  data1$`Taxa (Genus)`,
  data1$TaxonToothtype
)
# ---- Second pass: same transformation, applied to the relabelled snapshot ----
# Drop descriptive columns by position (columns 1, 2, 4-14 and 16-19).
data1 <- data1[, -c(1, 2, 4:14, 16:19)] # until 19
#clade: 4, taxa:2, teethtaxa: 3, cladetteth: 5, epoch:15
#data<-inner_join(dd,data)
#data<-data[!duplicated(data),]

# TransvUndu / Interdentsulci: 1 for any recorded non-zero entry; zeros and
# missing entries are carried over unchanged.
# LAF: the ranged entry "6-7" is replaced by 6.5.
# CTU1: first run of digits found in CTU.
data1 <- data1 %>%
  mutate(
    TransvUndu = ifelse(
      !is.na(`Transv. Undu.`) & `Transv. Undu.` != 0, 1, `Transv. Undu.`
    ),
    Interdentsulci = ifelse(
      !is.na(`Interdent. sulci`) & `Interdent. sulci` != 0,
      1,
      `Interdent. sulci`
    ),
    LAF = ifelse(LAF == "6-7", 6.5, LAF),
    CTU1 = sub(".*?(\\d+).*", "\\1", CTU)
  ) %>%
  select(-CTU, -`Interdent. sulci`, -`Transv. Undu.`)

# Every column except the first two is converted to numeric and then gets a
# natural-log companion named Log_<column>.
data1 <- data1 %>%
  mutate(across(-c(1, 2), as.numeric)) %>%
  mutate(across(-c(1, 2), log, .names = "Log_{.col}"))
## Warning: There were 17 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(3:ncol(data1), as.numeric)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 16 remaining warnings.
## Warning: There were 3 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(3:ncol(data1), log, .names = "Log_{.col}")`.
## Caused by warning:
## ! NaNs produced
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
# ---- Second pass: factors, missing-value filter, taxon frequency filter ----
# Grouping variables become factors; the quoted spelling of
# "Middle Cretaceous" is normalised first.
data1$TaxonToothtype <- as.factor(data1$TaxonToothtype)
data1$Epoch <- ifelse(
  data1$Epoch == "'Middle Cretaceous'", "Middle Cretaceous", data1$Epoch
)
data1$Epoch <- as.factor(data1$Epoch)
#data$Taxa<-as.factor(paste0(data$`Taxa (Genus)`,data$Maturity,sep=" "))
data1 <- data.frame(data1)

# Count the number of missing values in each column
missing_counts <- colSums(is.na(data1))
# Remove columns with more than 15% missing values
data1_cleaned <- data1[, missing_counts <= nrow(data1) * 0.15]
# Remove rows with any NA values
# NOTE(review): -Inf produced by log(0) is not NA, so such rows are not
# removed by this step.
data1_cleaned <- na.omit(data1_cleaned)
data1_cleaned

# Minimum group size: half the number of columns after the two grouping
# columns. This line had been fused with the print above into
# `data1_cleanedlennn<-`, so `lennn` was never refreshed for this pass.
lennn <- (ncol(data1_cleaned) - 2) / 2

# Frequency table of taxon/tooth-type labels, most frequent first.
taxa1 <- table(data1_cleaned$TaxonToothtype)
data1_cleanedd <- data.frame(taxa1)
data1_cleanedd <- data1_cleanedd[order(data1_cleanedd$Freq, decreasing = TRUE), ]
data1_cleanedd$ID <- seq_len(nrow(data1_cleanedd))
data1_cleanedd$TaxonToothtype <- data1_cleanedd$Var1

# Keep only the rows whose label occurs more than `lennn` times.
data1_cleaned1 <- data1_cleanedd[data1_cleanedd$Freq > lennn, ]
data1_cleaned <- data1_cleaned[
  data1_cleaned$TaxonToothtype %in% unique(data1_cleaned1$TaxonToothtype),
]
summary(data1_cleaned)
## TaxonToothtype Epoch CBL
## Saurornitholestes lateral:133 Late Cretaceous :601 Min. : 1.40
## Tyrannosaurus lateral :106 Middle Cretaceous:149 1st Qu.: 4.89
## Richardoestesia lateral : 45 Late Jurassic :137 Median :11.09
## Acrocanthosaurus lateral : 42 Early Cretaceous : 40 Mean :15.11
## Majungasaurus lateral : 40 Late Triassic : 29 3rd Qu.:21.99
## Pectinodon lateral : 40 Middle Jurassic : 28 Max. :54.50
## (Other) :578 (Other) : 0
## CBW CBR CHR DC
## Min. : 0.600 Min. :0.2500 Min. :0.400 Min. : 4.50
## 1st Qu.: 2.220 1st Qu.:0.4490 1st Qu.:1.646 1st Qu.:10.00
## Median : 5.940 Median :0.5233 Median :1.916 Median :15.00
## Mean : 9.342 Mean :0.5698 Mean :1.921 Mean :17.25
## 3rd Qu.:14.287 3rd Qu.:0.6571 3rd Qu.:2.184 3rd Qu.:20.00
## Max. :48.600 Max. :2.1840 Max. :3.509 Max. :60.00
##
## DDL CH Log_CBL Log_CBW
## Min. :0.08333 Min. : 2.200 Min. :0.3365 Min. :-0.5108
## 1st Qu.:0.25000 1st Qu.: 8.678 1st Qu.:1.5872 1st Qu.: 0.7975
## Median :0.33333 Median : 19.845 Median :2.4056 Median : 1.7817
## Mean :0.36781 Mean : 30.396 Mean :2.3223 Mean : 1.7102
## 3rd Qu.:0.50000 3rd Qu.: 44.237 3rd Qu.:3.0907 3rd Qu.: 2.6594
## Max. :1.11111 Max. :145.550 Max. :3.9982 Max. : 3.8836
##
## Log_CBR Log_CHR Log_DC Log_DDL
## Min. :-1.3863 Min. :-0.9163 Min. :1.504 Min. :-2.4849
## 1st Qu.:-0.8008 1st Qu.: 0.4981 1st Qu.:2.303 1st Qu.:-1.3863
## Median :-0.6477 Median : 0.6502 Median :2.708 Median :-1.0986
## Mean :-0.6109 Mean : 0.6308 Mean :2.720 Mean :-1.1101
## 3rd Qu.:-0.4199 3rd Qu.: 0.7813 3rd Qu.:2.996 3rd Qu.:-0.6931
## Max. : 0.7812 Max. : 1.2554 Max. :4.094 Max. : 0.1054
##
## Log_CH
## Min. :0.7885
## 1st Qu.:2.1607
## Median :2.9880
## Mean :2.9603
## 3rd Qu.:3.7896
## Max. :4.9805
##
# ---- Second pass: split into log / raw tables ----
# `data_lognames(data_log)[-1] <- ...` was a print statement fused with the
# renaming statement, and it called an undefined replacement function. The
# two statements are separated again.

# Positions of the log-transformed columns (their names contain "Log").
log_cols <- grep("Log", names(data1_cleaned))

# Log table: the two grouping columns plus every log column.
selected_cols <- c(1, 2, log_cols)
data_log <- data1_cleaned[, selected_cols]
data_log
# Replace spaces in column names (all but the first) with underscores.
names(data_log)[-1] <- gsub(" ", "_", names(data_log)[-1])

# Raw table: every column that is not a log column; column 1 is forced in.
cols_to_keep <- setdiff(seq_len(ncol(data1_cleaned)), log_cols)
cols_to_keep <- union(1, cols_to_keep)
data_original <- data1_cleaned[, cols_to_keep]
data_original
## [1] "TaxonToothtype" "Epoch" "CBL" "CBW"
## [5] "CBR" "CHR" "DC" "DDL"
## [9] "CH"
# ---- Second pass: keep taxa with at least 10 teeth ----
# Helper: keep the rows of `tbl` whose TaxonToothtype group has at least
# `min_obs` rows; the result is returned ungrouped.
keep_sampled_taxa <- function(tbl, min_obs) {
  by_taxon <- group_by(tbl, TaxonToothtype)
  ungroup(filter(by_taxon, n() >= min_obs))
}

data_log <- keep_sampled_taxa(data_log, 10)
data_original <- keep_sampled_taxa(data_original, 10)

# Labels that still have rows after filtering. The comparison with 0 drops
# factor levels that remain in the table with a zero count.
category_counts <- table(data_log$TaxonToothtype)
has_rows <- category_counts > 0
unique_values1 <- names(category_counts)[has_rows]